from IPython.display import display, HTML
import pandas as pd
import sqlite3
from sqlite3 import Error
def create_connection(db_file, delete_db=False):
import os
if delete_db and os.path.exists(db_file):
os.remove(db_file)
conn = None
try:
conn = sqlite3.connect(db_file)
conn.execute("PRAGMA foreign_keys = 1")
except Error as e:
print(e)
return conn
def create_table(conn, create_table_sql, drop_table_name=None):
if drop_table_name: # You can optionally pass drop_table_name to drop the table.
try:
c = conn.cursor()
c.execute("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
except Error as e:
print(e)
try:
c = conn.cursor()
c.execute(create_table_sql)
except Error as e:
print(e)
def execute_sql_statement(sql_statement, conn):
cur = conn.cursor()
cur.execute(sql_statement)
rows = cur.fetchall()
return rows
conn = create_connection('la_city_crime.db', delete_db=True)
#Read dataset
df = pd.read_csv("Arrest_data.csv")
df = df.dropna()
print(len(df))
print(df.columns)
#df['Charge Group Code'].value_counts()
/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3457: DtypeWarning: Columns (21,23) have mixed types.Specify dtype option on import or set low_memory=False. exec(code_obj, self.user_global_ns, self.user_ns)
414390
Index(['Report ID', 'Report Type', 'Arrest Date', 'Time', 'Area ID',
'Area Name', 'Reporting District', 'Age', 'Sex Code', 'Descent Code',
'Charge Group Code', 'Charge Group Description', 'Arrest Type Code',
'Charge', 'Charge Description', 'Disposition Description', 'Address',
'Cross Street', 'LAT', 'LON', 'Location', 'Booking Date',
'Booking Time', 'Booking Location', 'Booking Location Code'],
dtype='object')
sql_charge_detals = "CREATE TABLE ChargeDetails(ChargeGroupCode INTEGER NOT NULL PRIMARY KEY, ChargeGroupDesc TEXT NOT NULL, Charge TEXT NOT NULL, ChargeDesc TEXT NOT NULL, ChargeCount INTEGER NOT NULL);"
create_table(conn, sql_charge_detals, drop_table_name='ChargeDetails')
rows = []
charge_group_code = []
charge_group_desc = []
charge_code = []
charge_desc = []
charge_count = {}
df_charge_data = df[['Charge Group Code', 'Charge Group Description', 'Charge', 'Charge Description']]
print(df_charge_data.head(), len(df_charge_data), df_charge_data.index)
count = 0
for i in df_charge_data.index:
count+=1
try:
charge_count[int(df_charge_data['Charge Group Code'][i])] +=1
except KeyError:
charge_count[int(df_charge_data['Charge Group Code'][i])] = 1
if(int(df_charge_data['Charge Group Code'][i]) not in charge_group_code):
charge_group_code += [int(df_charge_data['Charge Group Code'][i])]
charge_group_desc += [df_charge_data['Charge Group Description'][i]]
charge_code += [df_charge_data['Charge'][i] + ":"]
charge_desc += [df_charge_data['Charge Description'][i] + ":"]
else:
index = charge_group_code.index(int(df_charge_data['Charge Group Code'][i]))
if(df_charge_data['Charge'][i] not in charge_code[index]):
charge_code[index] += df_charge_data['Charge'][i] + ":"
charge_desc[index] += df_charge_data['Charge Description'][i] + ":"
print(charge_count)
for i in range(0, len(charge_group_code)):
rows += [(int(charge_group_code[i]), charge_group_desc[i], charge_code[i][0:-1], charge_desc[i][0:-1],charge_count[int(charge_group_code[i])],)]
print(charge_group_code)
#print(len(charge_group_code), len(list(set(charge_group_code))))
#print(rows)
sql = """INSERT INTO ChargeDetails VALUES (?,?,?,?,?);"""
cur = conn.cursor()
cur.executemany(sql, rows)
conn.commit()
print(count)
Charge Group Code Charge Group Description Charge \
331 5.0 Burglary 459PC
673 6.0 Larceny 666PC
784 5.0 Burglary 459PC
2256 10.0 Fraud/Embezzlement 10980(C)2WI
3108 7.0 Vehicle Theft 10851(A)VC
Charge Description
331 BURGLARY
673 PETTY THEFT W/SPECIFIED PRIOR CONVICTION
784 BURGLARY
2256 OBTAIN/ETC AID BY FALSE STMENT/ETC > $400
3108 TAKE VEHICLE W/O OWNER'S CONSENT 414390 Int64Index([ 331, 673, 784, 2256, 3108, 4675, 4693,
4922, 5175, 44678,
...
1322791, 1322792, 1322793, 1322795, 1322798, 1322805, 1322808,
1322814, 1322817, 1322819],
dtype='int64', length=414390)
{5: 7650, 6: 10531, 10: 3771, 7: 15495, 12: 14968, 9: 2681, 24: 82146, 16: 90302, 3: 11811, 13: 24266, 22: 66380, 8: 16598, 15: 1240, 23: 33213, 4: 20679, 17: 525, 18: 1791, 14: 3169, 2: 514, 11: 3155, 20: 1909, 1: 751, 19: 279, 25: 133, 21: 190, 27: 41, 26: 202}
[5, 6, 10, 7, 12, 9, 24, 16, 3, 13, 22, 8, 15, 23, 4, 17, 18, 14, 2, 11, 20, 1, 19, 25, 21, 27, 26]
414390
import plotly.graph_objects as go
charge_by_group_count_data = execute_sql_statement("SELECT ChargeGroupDesc, ChargeCount FROM ChargeDetails ORDER BY ChargeCount DESC", conn)
charges = [row[0] for row in charge_by_group_count_data]
counts = [row[1] for row in charge_by_group_count_data]
fig = go.Figure([go.Bar(x=charges, y=counts,text=counts,
textposition='auto')])
fig.update_layout(title_text='Commonly Committed Crimes In LA City')
fig.show()
fig = go.Figure(data=[go.Pie(labels=charges, values=counts)])
fig.show()
1) Narcotic Drug Laws
2) Miscellaneous Other Violations
3) Driving Under Influence
4) Moving Traffic Violations
5) Prostitution/Allied
sql_location_details = "CREATE TABLE LocationDetails(AreaID INTEGER NOT NULL PRIMARY KEY, AreaName TEXT NOT NULL, ReportingDistrict TEXT NOT NULL, Address TEXT NOT NULL, CrossStreet TEXT NOT NULL, Location TEXT NOT NULL);"
create_table(conn, sql_location_details, drop_table_name='LocationDetails')
rows = []
area_id = []
area_name = []
reporting_district = []
address = []
cross_street = []
location = []
df_area_data = df[['Area ID', 'Area Name', 'Reporting District', 'Address', 'Cross Street', 'Location']]
for i in df_area_data.index:
#print(i)
if(int(df_area_data['Area ID'][i]) not in area_id):
area_id += [int(df_area_data['Area ID'][i])]
area_name += [df_area_data['Area Name'][i]]
reporting_district += [str(df_area_data['Reporting District'][i]) + ":"]
address += [str(df_area_data['Address'][i].replace(" ",'')) + ":"]
cross_street += [df_area_data['Cross Street'][i].replace(" ",'') + ":"]
location += [df_area_data['Location'][i] + ":"]
else:
index = area_id.index(int(df_area_data['Area ID'][i]))
address[index] += str(df_area_data['Address'][i].replace(" ",'')) + ":"
cross_street[index] += df_area_data['Cross Street'][i].replace(" ",'') + ":"
location[index] += df_area_data['Location'][i] + ":"
if(str(df_area_data['Reporting District'][i]) not in reporting_district[index]):
reporting_district[index] += str(df_area_data['Reporting District'][i]) + ":"
print(len(area_id), len(address), len(reporting_district), len(location), len(cross_street), len(area_name))
for i in range(0, len(area_id)):
rows += [(int(area_id[i]), area_name[i], reporting_district[i][0:-1], address[i][0:-1], cross_street[i][0:-1], location[i][0:-1],)]
#print(rows[0])
sql = """INSERT INTO LocationDetails VALUES (?,?,?,?,?,?);"""
cur = conn.cursor()
cur.executemany(sql, rows)
conn.commit()
21 21 21 21 21 21
import plotly.graph_objects as go
from collections import Counter
area_data = execute_sql_statement("SELECT * FROM LocationDetails ORDER BY AreaID ASC", conn)
area_name = [row[1] for row in area_data]
area_rep_district = [len(row[2].split(":")) for row in area_data]
areawise_crimes = [len(row[3].split(":")) for row in area_data]
fig = go.Figure([go.Bar(x=area_name, y=area_rep_district,text=area_rep_district,
textposition='auto')])
fig.update_layout(title_text='Areawise reporting districts')
fig.show()
fig = go.Figure([go.Bar(x=area_name, y=areawise_crimes,text=areawise_crimes,
textposition='auto')])
fig.update_layout(title_text='Areawise Crime Stats')
fig.show()
fig = go.Figure(data=[go.Pie(labels=area_name, values=areawise_crimes)])
fig.show()
Central & Hollywood are the locations with most number of unique crime locations - Clearly indicates that there are many possible locations to commit crimes in this locality. However, the number of reporting districts for these 2 locations is just above 50-60% as compared to the location with most number of reporting districts.
Devonshire & Wilshere are two of the locations with the least number of unique crimes spots. However, both these locations have 83 & 84 reporting districts respectively. This could mean one of the 2 things - (1) The high number of reporting districts in these places have kept the crime in check or (2) These 2 areas generally have less crimes taking place and hence the number of reporting districts could be unjustified.
We however think that the first case mentioned above is more probable as we can see that the number of cases on are extremely high in locations with low reporting districts.
Central & Hollywood account for nearly 25% of the unique crimes locations in LA!
We can simply use a metric to see how well managed or not a particular location is by simply taking the crimes in that area and dividing it by the number of reporting districts.
crime_rep_ratio = []
#print(areawise_crimes, area_rep_district, area_name)
for i in range(0, len(areawise_crimes)):
crime_rep_ratio += [areawise_crimes[i]/area_rep_district[i]]
#print(crime_rep_ratio)
fig = go.Figure([go.Bar(x=area_name, y=crime_rep_ratio,text=crime_rep_ratio,
textposition='auto')])
fig.update_layout(title_text='Crime Reporting Ratio For Each Area')
fig.show()
areawise_crime_addresses = [row[3].split(":") for row in area_data]
areastreet_crimes = [Counter(elem).most_common() for elem in areawise_crime_addresses]
area_locations = []
area_location_stats = []
for elem in areastreet_crimes:
temp_locs = []
temp_stats = []
for pair in elem:
temp_locs += [pair[0]]
temp_stats += [pair[1]]
area_locations += [temp_locs]
area_location_stats += [temp_stats]
crime_counts = []
crime_areas = []
crime_locations = []
crimes_1 = []
crimes_area_1 = []
crimes_2 = []
crimes_area_2 = []
crimes_3 = []
crimes_area_3 = []
for i in range(0,len(area_location_stats)):
for j in range(0,3):
crime_areas += [area_name[i]]
crime_counts += [area_location_stats[i][j]]
crime_locations += [area_locations[i][j]]
if(j==0):
crimes_1 += [area_location_stats[i][j]]
crimes_area_1 += [area_locations[i][j] + " (" + area_name[i] + ")"]
elif(j==1):
crimes_2 += [area_location_stats[i][j]]
crimes_area_2 += [area_locations[i][j] + " (" + area_name[i] + ")"]
elif(j==2):
crimes_3 += [area_location_stats[i][j]]
crimes_area_3 += [area_locations[i][j] + " (" + area_name[i] + ")"]
if(j==3):
break
#print(crimes_area_1)
y=crimes_area_1
fig = go.Figure()
fig.add_trace(go.Bar(y=y, x=crimes_1, name='Most Crimes',orientation='h'))
fig.add_trace(go.Bar(y=y, x=crimes_2, name='2nd Most Crimes',orientation='h'))
fig.add_trace(go.Bar(y=y, x=crimes_3, name='3rd Most Crimes',orientation='h'))
fig.update_layout(barmode='stack', yaxis={'categoryorder':'total ascending'}, width=1000, height=700)
fig.show()
x=crimes_area_1
fig = go.Figure(go.Bar(x=x, y=crimes_1, name='Most Crimes'))
fig.update_layout(title="Most Crimes Areawise Streetwise", barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()
x=crimes_area_2
fig = go.Figure(go.Bar(x=x, y=crimes_2, name='Most Crimes'))
fig.update_layout(title="2nd Most Crimes Areawise Streetwise", barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()
x=crimes_area_3
fig = go.Figure(go.Bar(x=x, y=crimes_3, name='Most Crimes'))
fig.update_layout(title="3rd Most Crimes Areawise Streetwise", barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()
As we can see from the first horizontal bar chart, the locations are segregated by streets and crime count. Central, Hollywood and Van Nuys account for the top 3 most crimes when only considering the top 3 streets for each locality.
6th Street, Hollywood and Sepulveda are the streets with highest crime counts in each of the locations of Central, Hollywood and Van Nuys. Similarly streets with 2nd most and 3rd most crimes in each location are also depicted above
sql_arrest = '''create table ArrestDetails(ReportID INTEGER NOT NULL PRIMARY KEY,
ReportType TEXT NOT NULL,
ArrestDate TEXT NOT NULL,
ArrestTime INTEGER NOT NULL,
ArrestTypeCode TEXT NOT NULL,
Age INTEGER NOT NULL,
SexCode TEXT NOT NULL,
DescentCode TEXT NOT NULL,
AreaID INTEGER NOT NULL,
ChargeGroupCode INTEGER NOT NULL,
FOREIGN KEY (AreaID) REFERENCES LocationDetails(AreaID),
FOREIGN KEY (ChargeGroupCode) REFERENCES ChargeDetails(ChargeGroupCode)
);
'''
create_table(conn, sql_arrest, drop_table_name = 'ArrestDetails')
rows = []
report_id = []
report_type = []
arrest_date = []
arrest_time = []
arrest_type_code = []
age = []
sex_code = []
descent_code = []
area_id = []
charge_gcode = []
df_arrest_data = df[['Report ID', 'Report Type', 'Arrest Date', 'Time', 'Arrest Type Code', 'Age', 'Sex Code', 'Descent Code', 'Area ID', 'Charge Group Code']]
for i in df_arrest_data.index[0:100000]:
report_id += [int(df_arrest_data['Report ID'][i])]
report_type += [df_arrest_data['Report Type'][i]]
arrest_date += [df_arrest_data['Arrest Date'][i]]
arrest_time += [int(df_arrest_data['Time'][i])]
arrest_type_code += [df_arrest_data['Arrest Type Code'][i]]
age += [int(df_arrest_data['Age'][i])]
sex_code += [df_arrest_data['Sex Code'][i]]
descent_code += [df_arrest_data['Descent Code'][i]]
area_id += [int(df_arrest_data['Area ID'][i])]
charge_gcode += [int(df_arrest_data['Charge Group Code'][i])]
for i in range(0, len(report_id)):
rows += [(int(report_id[i]), report_type[i], arrest_date[i], int(arrest_time[i]), arrest_type_code[i], int(age[i]), sex_code[i], descent_code[i], int(area_id[i]), int(charge_gcode[i]))]
sql = """INSERT INTO ArrestDetails VALUES (?,?,?,?,?,?,?,?,?,?);"""
cur = conn.cursor()
cur.executemany(sql, rows)
conn.commit()
arrests_data = execute_sql_statement("SELECT * FROM ArrestDetails;", conn)
charges_data = execute_sql_statement("SELECT * FROM ChargeDetails;", conn)
results = execute_sql_statement("SELECT ArrestDetails.ArrestTime, LocationDetails.AreaName, ChargeDetails.ChargeGroupDesc FROM ArrestDetails JOIN LocationDetails ON ArrestDetails.AreaID=LocationDetails.AreaID JOIN ChargeDetails ON ArrestDetails.ChargeGroupCode=ChargeDetails.ChargeGroupCode;", conn)
area_time_stats = {}
charge_time_stats = {}
area_charge_stats = {}
for area in area_data:
area_time_stats[area[1]] = {"1st Phase" : 0, "2nd Phase" : 0, "3rd Phase" : 0, "4th Phase" : 0}
for charge in charges_data:
charge_time_stats[charge[1]] = {"1st Phase" : 0, "2nd Phase" : 0, "3rd Phase" : 0, "4th Phase" : 0}
for i in range(0, len(area_data)):
area_charge_stats[area_data[i][1]] = {}
for j in range(0, len(charges_data)):
area_charge_stats[area_data[i][1]][charges_data[j][1]] = 0
timewise_crimes = [0, 0, 0, 0]
for result in results:
#print(area_time_stats)
area_charge_stats[result[1]][result[2]] += 1
if(result[0]>=0 and result[0]<600):
area_time_stats[result[1]]["1st Phase"] += 1
charge_time_stats[result[2]]["1st Phase"] += 1
timewise_crimes[0] += 1
elif(result[0]>=600 and result[0]<1200):
area_time_stats[result[1]]["2nd Phase"] += 1
charge_time_stats[result[2]]["2nd Phase"] += 1
timewise_crimes[1] += 1
elif(result[0]>=1200 and result[0]<1800):
area_time_stats[result[1]]["3rd Phase"] += 1
charge_time_stats[result[2]]["3rd Phase"] += 1
timewise_crimes[2] += 1
elif(result[0]>=1800 and result[0]<2400):
area_time_stats[result[1]]["4th Phase"] += 1
charge_time_stats[result[2]]["4th Phase"] += 1
timewise_crimes[3] += 1
central_stats = [area_time_stats["Central"]["1st Phase"], area_time_stats["Central"]["2nd Phase"], area_time_stats["Central"]["3rd Phase"], area_time_stats["Central"]["4th Phase"]]
hollywood_stats = [area_time_stats["Hollywood"]["1st Phase"], area_time_stats["Hollywood"]["2nd Phase"], area_time_stats["Hollywood"]["3rd Phase"], area_time_stats["Hollywood"]["4th Phase"]]
fig = go.Figure(data=[go.Pie(labels=["00:00hrs to 06:00hrs","06:00hrs to 12:00hrs","12:00hrs to 18:00hrs","18:00hrs to 24:00hrs"], values=timewise_crimes)])
fig.update_layout(title="Crimes by Time In LA")
fig.show()
fig = go.Figure(data=[go.Pie(labels=["00:00hrs to 06:00hrs","06:00hrs to 12:00hrs","12:00hrs to 18:00hrs","18:00hrs to 24:00hrs"], values=central_stats)])
fig.update_layout(title="Crimes by Time In Central Area In LA")
fig.show()
fig = go.Figure(data=[go.Pie(labels=["00:00hrs to 06:00hrs","06:00hrs to 12:00hrs","12:00hrs to 18:00hrs","18:00hrs to 24:00hrs"], values=hollywood_stats)])
fig.update_layout(title="Crimes by Time In Hollywood Area In LA")
fig.show()
In central, over 2/3rds of the crimes take place between 6AM & 6PM and close to 1/4th take place from 6PM-12 midnight. The number of crimes during the time after midnight is fairly low.
This however, is not the case with Hollywood. Hollywood has over 1/3rds of crimes taking place between the 12 midnight - 6AM period. This could possibly be because of how happening the location is. Another 1/3rd of the crimes take place between 6PM-Midnight! People here probably love the dark!
ndl_charge_stats = [charge_time_stats["Narcotic Drug Laws"]["1st Phase"], charge_time_stats["Narcotic Drug Laws"]["2nd Phase"], charge_time_stats["Narcotic Drug Laws"]["3rd Phase"], charge_time_stats["Narcotic Drug Laws"]["4th Phase"]]
fig = go.Figure(data=[go.Pie(labels=["00:00hrs to 06:00hrs","06:00hrs to 12:00hrs","12:00hrs to 18:00hrs","18:00hrs to 24:00hrs"], values=ndl_charge_stats)])
fig.update_layout(title="Narcotic Drug Crimes by Time In LA")
fig.show()
dui_charge_stats = [charge_time_stats["Driving Under Influence"]["1st Phase"], charge_time_stats["Driving Under Influence"]["2nd Phase"], charge_time_stats["Driving Under Influence"]["3rd Phase"], charge_time_stats["Driving Under Influence"]["4th Phase"]]
fig = go.Figure(data=[go.Pie(labels=["00:00hrs to 06:00hrs","06:00hrs to 12:00hrs","12:00hrs to 18:00hrs","18:00hrs to 24:00hrs"], values=dui_charge_stats)])
fig.update_layout(title="DUI Charges by Time In LA")
fig.show()
mtv_charge_stats = [charge_time_stats["Moving Traffic Violations"]["1st Phase"], charge_time_stats["Moving Traffic Violations"]["2nd Phase"], charge_time_stats["Moving Traffic Violations"]["3rd Phase"], charge_time_stats["Moving Traffic Violations"]["4th Phase"]]
fig = go.Figure(data=[go.Pie(labels=["00:00hrs to 06:00hrs","06:00hrs to 12:00hrs","12:00hrs to 18:00hrs","18:00hrs to 24:00hrs"], values=mtv_charge_stats)])
fig.update_layout(title="Moving Traffic Violation Charges by Time In LA")
fig.show()
We can see that over 75% of the narcotic crimes are committed between 12noon and 12midnight!
Over 80% of people caught in DUI are usually caught between 6PM & 6AM.
Coming to narcotics cases, most of them are booked between 12noon-12midnight. Usually this is the case because people tend to buy narcotics in the day and do it in the night, forcing peddlers to come out during the daytime to supply this to those who want it.
Driving under influence on the other hand is usually charged between 6PM & 6AM. This makes sense too as people usually tend to drink post 6PM and perhaps get caught more often during these time periods.
Looking at the Moving Traffic Violations pie chart, we see that almost half the charges booked were between 6PM and 12 Midnight. This could mainly be because evening times are generally considered rush hours and that is one time in the day where there is a lot of moving traffic together on the road at once. The MTV charges booked during the time slot between 12 midnight and 6AM are the least and just account for hardly 15% of the total charges.
Arrested people of ages greater than 75 are outliers.
age = execute_sql_statement("SELECT Age FROM ArrestDetails;", conn)
import plotly.express as px
fig = px.box(age)
fig.show()
chg_code = execute_sql_statement("SELECT ReportID, ChargeGroupCode FROM ArrestDetails;", conn)
# chg_df = pd.DataFrame(chg_code, columns=['Report ID', 'Charge group code'])
# import plotly.graph_objects as px
# plot = px.Figure(data=[px.Scatter(
# x = chg_df['Report ID'],
# y = chg_df['Charge group code'],
# mode = 'markers',)
# ])
# plot.update_layout(
# title='Charge group codes by report',
# xaxis_title="Report ID",
# yaxis_title="Charge group code")
# plot.show()
arrest_date = execute_sql_statement("SELECT ReportID, ArrestDate FROM ArrestDetails;", conn)
arrest_date_df = pd.DataFrame(arrest_date, columns=['Report ID', 'Date'])
arrest_date_df
| Report ID | Date | |
|---|---|---|
| 0 | 1027138 | 01/19/2010 |
| 1 | 1089278 | 01/19/2010 |
| 2 | 1390123 | 01/07/2011 |
| 3 | 1556591 | 08/13/2013 |
| 4 | 1590033 | 01/19/2010 |
| ... | ... | ... |
| 99995 | 3509751 | 03/28/2013 |
| 99996 | 3550958 | 05/06/2013 |
| 99997 | 4050074 | 08/02/2014 |
| 99998 | 4118886 | 10/09/2014 |
| 99999 | 4442940 | 09/13/2015 |
100000 rows × 2 columns
arrest_date_df['Split Date'] = arrest_date_df['Date'].str.split('/')
arrest_date_df
| Report ID | Date | Split Date | |
|---|---|---|---|
| 0 | 1027138 | 01/19/2010 | [01, 19, 2010] |
| 1 | 1089278 | 01/19/2010 | [01, 19, 2010] |
| 2 | 1390123 | 01/07/2011 | [01, 07, 2011] |
| 3 | 1556591 | 08/13/2013 | [08, 13, 2013] |
| 4 | 1590033 | 01/19/2010 | [01, 19, 2010] |
| ... | ... | ... | ... |
| 99995 | 3509751 | 03/28/2013 | [03, 28, 2013] |
| 99996 | 3550958 | 05/06/2013 | [05, 06, 2013] |
| 99997 | 4050074 | 08/02/2014 | [08, 02, 2014] |
| 99998 | 4118886 | 10/09/2014 | [10, 09, 2014] |
| 99999 | 4442940 | 09/13/2015 | [09, 13, 2015] |
100000 rows × 3 columns
l = []
d = []
val = arrest_date_df['Split Date'].values.tolist()
for i in range(100000):
l.append(val[i][0])
d.append(val[i][1])
arrest_date_df['Month'] = l
arrest_date_df['Day'] = d
arrest_date_df
| Report ID | Date | Split Date | Month | Day | |
|---|---|---|---|---|---|
| 0 | 1027138 | 01/19/2010 | [01, 19, 2010] | 01 | 19 |
| 1 | 1089278 | 01/19/2010 | [01, 19, 2010] | 01 | 19 |
| 2 | 1390123 | 01/07/2011 | [01, 07, 2011] | 01 | 07 |
| 3 | 1556591 | 08/13/2013 | [08, 13, 2013] | 08 | 13 |
| 4 | 1590033 | 01/19/2010 | [01, 19, 2010] | 01 | 19 |
| ... | ... | ... | ... | ... | ... |
| 99995 | 3509751 | 03/28/2013 | [03, 28, 2013] | 03 | 28 |
| 99996 | 3550958 | 05/06/2013 | [05, 06, 2013] | 05 | 06 |
| 99997 | 4050074 | 08/02/2014 | [08, 02, 2014] | 08 | 02 |
| 99998 | 4118886 | 10/09/2014 | [10, 09, 2014] | 10 | 09 |
| 99999 | 4442940 | 09/13/2015 | [09, 13, 2015] | 09 | 13 |
100000 rows × 5 columns
mth_cnts = pd.DataFrame(arrest_date_df['Month'].value_counts())
day_cnts = pd.DataFrame(arrest_date_df['Day'].value_counts())
mth_cnts
| Month | |
|---|---|
| 01 | 12485 |
| 02 | 8682 |
| 07 | 8623 |
| 08 | 8522 |
| 03 | 8342 |
| 09 | 8154 |
| 05 | 7933 |
| 10 | 7903 |
| 06 | 7701 |
| 04 | 7476 |
| 11 | 7217 |
| 12 | 6962 |
day_cnts.sort_index(inplace=True)
day_cnts
| Day | |
|---|---|
| 01 | 3554 |
| 02 | 3630 |
| 03 | 3588 |
| 04 | 3588 |
| 05 | 3277 |
| 06 | 3347 |
| 07 | 3609 |
| 08 | 3460 |
| 09 | 3541 |
| 10 | 3506 |
| 11 | 3421 |
| 12 | 3400 |
| 13 | 3352 |
| 14 | 3191 |
| 15 | 3235 |
| 16 | 3232 |
| 17 | 3203 |
| 18 | 3086 |
| 19 | 3091 |
| 20 | 3089 |
| 21 | 3228 |
| 22 | 3265 |
| 23 | 3232 |
| 24 | 2826 |
| 25 | 2722 |
| 26 | 3093 |
| 27 | 3206 |
| 28 | 3272 |
| 29 | 2904 |
| 30 | 3063 |
| 31 | 1789 |
import plotly.graph_objects as px
plot = px.Figure(data=[px.Scatter(
x = mth_cnts.index,
y = mth_cnts['Month'],
mode = 'markers',)
])
plot.update_layout(
title='Number of arrests by the month of the year',
xaxis_title="Month",
yaxis_title="Number of arrests",)
plot.show()
Number of arrests in the month of january is an outlier. It is anomalously high as compared to the other months of the year.
plot = px.Figure(data=[px.Scatter(
x = day_cnts.index,
y = day_cnts['Day'],
mode = 'markers',
)
])
plot.update_layout(
title='Number of arrests by day of the month',
xaxis_title="Day",
yaxis_title="Number of arrests",)
plot.show()
Number of arrests made on the 31st of a month are anomalously low as compared to other days.
time = execute_sql_statement("SELECT ReportID, ArrestTime FROM ArrestDetails;", conn)
arrest_time_df = pd.DataFrame(time, columns=['Report ID', 'Time'])
time_c = pd.DataFrame(arrest_time_df['Time'].value_counts())
time_c.sort_index(inplace=True)
plot = px.Figure(data=[px.Scatter(
x = time_c.index,
y = time_c['Time'],
mode = 'markers',)
])
plot.update_layout(
title='Times of Crime',
xaxis_title="Time",
yaxis_title="Number of arrests",)
plot.show()
Number of arrests suddenly drops to a minimum at 5:00 AM. As night approaches, the number of arrests linearly increases. The peak can be seen at 8:00 PM. From midnight to early morning, the number of arrests linearly fall.